---
jupyter:
  jupytext:
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.3'
      jupytext_version: 1.14.6
  kernelspec:
    display_name: Python 3 (ipykernel)
    language: python
    name: python3
---

# SPARQL queries for NifVector

In the queries below, `<phrase_uri>` and `<context_uri>` are placeholders for the IRI of the phrase or context you are querying, and `topcontexts` and `topphrases` are placeholders for integer limits. Replace them with actual values before running a query.

## From phrase to contexts in which the phrase is used

This produces the context vector of a phrase:

```console
SELECT DISTINCT ?v (sum(?count) as ?n)
WHERE
{
    {
        <phrase_uri> nifvec:isPhraseOf ?w .
        ?w rdf:type nifvec:Window .
        ?w nifvec:hasCount ?count .
        ?w nifvec:hasContext ?c .
        ?c rdf:value ?v .
    }
}
GROUP BY ?v
ORDER BY DESC(?n)
```

## From context to phrases that are used in the context

This produces the phrase vector of a context:

```console
SELECT DISTINCT ?v (sum(?count) as ?num)
WHERE
{
    {
        <context_uri> nifvec:isContextOf ?w .
        ?w rdf:type nifvec:Window .
        ?w nifvec:hasCount ?count .
        ?p nifvec:isPhraseOf ?w .
        ?p rdf:value ?v .
    }
}
GROUP BY ?v
ORDER BY DESC(?num)
```

## Most similar phrases of a phrase

The inner query selects the `topcontexts` most frequent contexts of the phrase; the outer query counts, for every phrase, how many of those contexts it shares.

```console
SELECT DISTINCT ?v (count(?c) as ?num1)
WHERE
{
    {
        {
            SELECT DISTINCT ?c (sum(?count1) as ?n1)
            WHERE
            {
                <phrase_uri> nifvec:isPhraseOf ?w1 .
                ?w1 rdf:type nifvec:Window .
                ?w1 nifvec:hasContext ?c .
                ?w1 nifvec:hasCount ?count1 .
            }
            GROUP BY ?c
            ORDER BY DESC(?n1)
            LIMIT topcontexts
        }
        ?c nifvec:isContextOf ?w .
        ?p nifvec:isPhraseOf ?w .
        ?w rdf:type nifvec:Window .
        ?p rdf:value ?v .
    }
}
GROUP BY ?v
ORDER BY DESC (?num1)
```

## Most similar phrases of a phrase with a context

As above, but the candidate phrases are additionally restricted to the `topphrases` most frequent phrases of the given context.

```console
SELECT DISTINCT ?v (count(?c) as ?num1)
WHERE
{
    {
        {
            SELECT DISTINCT ?c (sum(?count1) as ?n1)
            WHERE
            {
                <phrase_uri> nifvec:isPhraseOf ?w1 .
                ?w1 rdf:type nifvec:Window .
                ?w1 nifvec:hasContext ?c .
                ?w1 nifvec:hasCount ?count1 .
            }
            GROUP BY ?c
            ORDER BY DESC(?n1)
            LIMIT topcontexts
        }
        {
            SELECT DISTINCT ?p (sum(?count2) as ?n2)
            WHERE
            {
                <context_uri> nifvec:isContextOf ?w2 .
                ?w2 rdf:type nifvec:Window .
                ?w2 nifvec:hasPhrase ?p .
                ?w2 nifvec:hasCount ?count2 .
            }
            GROUP BY ?p
            ORDER BY DESC(?n2)
            LIMIT topphrases
        }
        ?c nifvec:isContextOf ?w .
        ?p nifvec:isPhraseOf ?w .
        ?w rdf:type nifvec:Window .
        ?p rdf:value ?v .
    }
}
GROUP BY ?v
ORDER BY DESC (?num1)
```

## Exporting the vectors to pickle files

The cells below assume that `g` is a graph object with the NifVector data already loaded and the `nif`, `nifvec`, `rdf` and `ontolex` prefixes bound (for example an `rdflib.Graph`); it is not created in the cells shown here.

```python
import pickle
from collections import Counter, defaultdict

# phrases
# Sum the window counts for every (phrase value, left context value,
# right context value) combination.
q = """
SELECT distinct ?v ?l ?r (sum(?count) as ?num)
WHERE
{
    ?p rdf:type nif:Phrase .
    ?p nifvec:isPhraseOf ?w .
    ?w rdf:type nifvec:Window .
    ?w nifvec:hasContext ?c .
    ?w nifvec:hasCount ?count .
    ?p rdf:value ?v .
    ?c nifvec:hasLeftValue ?l .
    ?c nifvec:hasRightValue ?r .
}
GROUP BY ?v ?l ?r
ORDER BY DESC (?num)
"""
# NOTE(review): `g` must be defined before this cell runs; each result row is
# expected to hold terms exposing `.value` (as rdflib literals do) - confirm.
results_phrases = list(g.query(q))

# phrase value -> Counter {(left value, right value): summed count}
d_phrases = defaultdict(Counter)
for r in results_phrases:
    d_phrases[r[0].value][(r[1].value, r[2].value)] = r[3].value

with open('..//data//phrase_contexts_vectors.pickle', 'wb') as handle:
    pickle.dump(d_phrases, handle, protocol=pickle.HIGHEST_PROTOCOL)
```

```python
import pickle
from collections import Counter, defaultdict

# contexts
# Sum the window counts for every (left context value, right context value,
# phrase value) combination.
q = """
SELECT distinct ?l ?r ?v (sum(?count) as ?num)
WHERE
{
    ?c nifvec:hasLeftValue ?l .
    ?c nifvec:hasRightValue ?r .
    ?w nifvec:hasContext ?c .
    ?w nifvec:hasPhrase ?p .
    ?w rdf:type nifvec:Window .
    ?w nifvec:hasCount ?count .
    ?p rdf:value ?v .
}
GROUP BY ?l ?r ?v
ORDER BY DESC (?num)
"""
# NOTE(review): `g` must be defined before this cell runs - confirm.
results_context_phrases = list(g.query(q))

# (left value, right value) -> Counter {phrase value: summed count}
d_context_phrases = defaultdict(Counter)
for r in results_context_phrases:
    d_context_phrases[(r[0].value, r[1].value)][r[2].value] = r[3].value

with open('..//data//context_phrases_vectors.pickle', 'wb') as handle:
    pickle.dump(d_context_phrases, handle, protocol=pickle.HIGHEST_PROTOCOL)
```

```python
import pickle
from collections import Counter, defaultdict

# lemmas
# For every value ?v that is a written representation (canonical or other
# form) of a lexical entry ?e, sum the window counts of all phrases whose
# value ?f is any written form of that same entry, per (left, right) context.
q = """
SELECT distinct ?v ?l ?r (sum(?count) as ?num)
WHERE
{
    {
        ?p rdf:value ?v .
        {
            ?e ontolex:canonicalForm [ ontolex:writtenRep ?v ] .
        }
        UNION
        {
            ?e ontolex:otherForm [ ontolex:writtenRep ?v ] .
        }
        ?e ontolex:otherForm|ontolex:canonicalForm [ ontolex:writtenRep ?f ] .
        ?lemma rdf:value ?f .
        ?lemma nifvec:isPhraseOf ?w .
        ?w rdf:type nifvec:Window .
        ?w nifvec:hasContext ?c .
        ?w nifvec:hasCount ?count .
        ?c nifvec:hasLeftValue ?l .
        ?c nifvec:hasRightValue ?r .
    }
}
GROUP BY ?v ?l ?r
ORDER BY DESC (?num)
"""
# NOTE(review): `g` must be defined before this cell runs - confirm.
results_lemmas = list(g.query(q))

# written form -> Counter {(left value, right value): summed count}
d_lemmas = defaultdict(Counter)
for r in results_lemmas:
    d_lemmas[r[0].value][(r[1].value, r[2].value)] = r[3].value

with open('..//data//lemma_contexts_vectors.pickle', 'wb') as handle:
    pickle.dump(d_lemmas, handle, protocol=pickle.HIGHEST_PROTOCOL)
```

```python
# Disabled cell: context -> lemma vectors, kept commented out as in the original.
# q = """
# SELECT distinct ?l ?r ?v (sum(?count) as ?num)
# WHERE
# {
#     {
#         ?c nifvec:hasLeftValue ?l .
#         ?c nifvec:hasRightValue ?r .
#         ?w rdf:type nifvec:Window .
#         ?w nifvec:hasContext ?c .
#         ?w nifvec:hasCount ?count .
#         ?w nifvec:hasPhrase ?p .
#         ?p rdf:value ?f .
#         {
#             ?e ontolex:canonicalForm [ ontolex:writtenRep ?f ] .
#         }
#         UNION
#         {
#             ?e ontolex:otherForm [ ontolex:writtenRep ?f ] .
#         }
#         ?e ontolex:otherForm|ontolex:canonicalForm [ ontolex:writtenRep ?v ] .
#         ?lemma rdf:value ?v .
#     }
# }
# GROUP BY ?l ?r ?v
# ORDER BY DESC (?num)
# """
# results_context_lemmas = list(g.query(q))
# d_context_lemmas = defaultdict(Counter)
# for r in results_context_lemmas:
#     d_context_lemmas[(r[0].value, r[1].value)][r[2].value] = r[3].value
# with open('..//data//context_lemmas_vectors.pickle', 'wb') as handle:
#     pickle.dump(d_context_lemmas, handle, protocol=pickle.HIGHEST_PROTOCOL)
```